# Alexander F. Gazmararian
# agazmararian@gmail.com

library(tidyverse)
library(tidylog)
library(ggmap)
library(here)
library(readxl)
library(janitor)
library(glue)

# Replication mode detection ----
# In anonymized mode, geocoding cache is not available - skip this step
source(here("R", "visibility", "replication_mode.R"))
REPLICATION_MODE <- init_replication_mode()

if (REPLICATION_MODE == "anonymized") {
  # Anonymized package: report that geocoding is skipped. This branch does
  # nothing else; all geocoding work sits in the else branch.
  anonymized_notice <- c(
    "=== ANONYMIZED MODE ===",
    "Skipping ZIP geocoding (coordinates not available in anonymized package)",
    "Distance calculations will use pre-cached results"
  )
  for (notice_line in anonymized_notice) {
    message(notice_line)
  }
} else {

# Processed survey data; only its distinct ZIP codes are used here.
g <- readRDS(here("data", "cache", "survey_visibility_processed.rds"))

# Treat ZIPs as character and drop missing values. A missing ZIP that is not
# in the address file would otherwise be geocoded as the literal address
# "NA , USA" further down.
zip.uq <- unique(as.character(g$zip_survey))
zip.uq <- zip.uq[!is.na(zip.uq)]
message(glue("Number of unique zip codes: {length(zip.uq)}"))

# Check API cache to see whether we've already geocoded these
message("Checking API cache for already geocoded ZIPs...")
zip_cache_file <- here("data", "cache", "geocoding", "zip_code_geoloc.csv")
if (file.exists(zip_cache_file)) {
  # Pin the column types instead of letting read_csv() guess them. If `zip`
  # is guessed as numeric, leading zeros are lost ("02139" -> 2139), those
  # ZIPs no longer match the cache, and bind_rows() cannot combine the
  # numeric column with the character ZIPs geocoded below.
  cached <- read_csv(
    zip_cache_file,
    col_types = cols(
      zip = col_character(),
      lon = col_double(),
      lat = col_double()
    ),
    show_col_types = FALSE
  )
} else {
  # No cache yet: start from an empty table with the same three columns
  cached <- data.frame(zip = character(), lon = numeric(), lat = numeric())
}

# Only ZIPs without a cache entry still need to be geocoded
zip.uq <- zip.uq[!zip.uq %in% cached$zip]
message(glue("Number of unique zip codes to geocode: {length(zip.uq)}"))

if (length(zip.uq) == 0) {
  message("[OK] All ZIP codes already geocoded. Skipping geocoding.")
  # Nothing new was geocoded: use an empty placeholder with the cache's three
  # columns so the final summary can still combine it with `cached`
  df.out <- data.frame(zip = character(0), lon = numeric(0), lat = numeric(0))
} else {

  # Read the Google Maps API key from the environment; stop early if it is
  # missing, since no geocoding request can be made without it
  google_api_key <- Sys.getenv("GOOGLE_MAPS_API_KEY")
  if (!nzchar(google_api_key)) {
    stop("GOOGLE_MAPS_API_KEY not set. Please add it to your .Renviron file.")
  }

  # Register the key with ggmap only when no key is registered yet
  key_already_registered <- has_google_key()
  if (!key_already_registered) {
    register_google(key = google_api_key)
  }
  
  # Read the ZIP locale spreadsheet, normalise its column names, and keep only
  # the rows whose delivery ZIP still needs to be geocoded
  zips <- here("data", "input", "zipcodes", "ZIP_Locale_Detail.xls") %>%
    read_xls() %>%
    clean_names() %>%
    filter(delivery_zipcode %in% zip.uq)
  
  # Normalise a character vector of street addresses before geocoding.
  #
  # Steps, in order: squish whitespace, expand common street abbreviations,
  # remove every character that is not a word character, whitespace, or a
  # comma, then convert to title case. Returns a vector of the same length.
  #
  # NOTE(review): the abbreviation patterns are case-sensitive, so they match
  # "St", "Rd", ... only as written here - confirm this fits the input data.
  clean_address <- function(address) {
    # A named vector makes str_replace_all() apply each pattern in turn
    abbreviations <- c(
      "\\bSt\\b" = "Street",
      "\\bRd\\b" = "Road",
      "\\bAve\\b" = "Avenue",
      "\\bDr\\b" = "Drive",
      "\\bLn\\b" = "Lane",
      "\\bBlvd\\b" = "Boulevard",
      "\\bApt\\b" = "Apartment",
      "\\bNo\\b" = "Number"
    )

    address %>%
      str_squish() %>%
      str_replace_all(abbreviations) %>%
      str_replace_all("[^\\w\\s,]", "") %>%
      str_to_title()
  }
  
  # Build one geocoder query string per address row:
  # "<street> <city> , <state> <zip>"
  zips$physical_delv_addr <- clean_address(zips$physical_delv_addr)
  zips$address <- with(
    zips,
    paste(
      physical_delv_addr, physical_city, ",", physical_state, delivery_zipcode,
      sep = " "
    )
  )

  # One row per distinct (address, ZIP) pair
  zips_input <- zips %>%
    dplyr::select(address, delivery_zipcode) %>%
    unique()

  # Add ZIPs back that have no row in the address file; they are geocoded
  # from the bare ZIP code ("<zip> , USA")
  zips_missed <- zip.uq[!zip.uq %in% zips_input$delivery_zipcode]
  if (length(zips_missed) > 0) {
    # The guard matters: paste() treats a zero-length vector as "", so
    # paste(character(0), ", USA") has length one and data.frame() would stop
    # with a "differing number of rows: 1, 0" error when no ZIP is missing.
    zips_missed <- data.frame(
      address = paste(zips_missed, ", USA"),
      delivery_zipcode = zips_missed
    )
    zips_input <- bind_rows(zips_input, zips_missed)
  }
  
  # Geocode in batches and rewrite the cache file after every successful
  # batch, so results obtained so far are on disk if a later batch fails
  batch_size <- 50  # Adjust batch size as needed
  n_addresses <- nrow(zips_input)
  df.out <- data.frame(zip = character(), lon = numeric(), lat = numeric())

  # Error handler: report the failed batch and let the loop move on. `i` and
  # `batch_size` are looked up when the handler runs, i.e. for the current batch.
  report_batch_error <- function(e) {
    message(glue("[ERROR] Error in batch {ceiling(i/batch_size)}: {e$message}"))
    message("Continuing with next batch...")
  }

  message(glue("Starting geocoding of {n_addresses} addresses in batches of {batch_size}..."))

  for (i in seq(1, n_addresses, by = batch_size)) {
    # Last row of this batch; the final batch may be shorter than batch_size
    batch_end <- min(i + batch_size - 1, n_addresses)

    message(glue("Processing batch {ceiling(i/batch_size)} of {ceiling(n_addresses/batch_size)} (addresses {i}-{batch_end})..."))

    current_rows <- zips_input[i:batch_end, ]
    query_addresses <- current_rows$address
    query_zips <- current_rows$delivery_zipcode

    tryCatch(
      {
        coords <- geocode(query_addresses, output = "latlon", source = "google")

        # Pair each ZIP with its coordinates under the cache's column names
        batch_result <- setNames(
          data.frame(zip = query_zips, coords),
          c("zip", "lon", "lat")
        )

        # Accumulate this run's results
        df.out <- bind_rows(df.out, batch_result)

        # Save progress: previous cache plus everything geocoded in this run
        g.complete <- unique(bind_rows(cached, df.out))
        write_csv(g.complete, file = here("data", "cache", "geocoding", "zip_code_geoloc.csv"))

        message(glue("[OK] Batch {ceiling(i/batch_size)} completed. Total geocoded: {nrow(df.out)}/{n_addresses}"))

        # Small delay to respect API rate limits
        Sys.sleep(0.1)
      },
      error = report_batch_error
    )
  }

}

# Final cleanup and summary: de-duplicate this run's results, merge them with
# the previously cached rows, and write the combined table back to the cache
df.out <- unique(df.out)
g.complete <- cached %>%
  bind_rows(df.out) %>%
  unique()

write_csv(g.complete, file = here("data", "cache", "geocoding", "zip_code_geoloc.csv"))

# Report the size of the full cache and of this run's additions
message(glue("Number of geocoded ZIPs: {nrow(g.complete)}"))
message(glue("Number of newly added geocoded ZIPs: {nrow(df.out)}"))
message(glue("[OK] ZIP code geocoding complete and saved to: {here('data', 'cache', 'geocoding', 'zip_code_geoloc.csv')}"))

} # End of REPLICATION_MODE == "full" block
